{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import statistics\n",
    "import glob\n",
    "import time\n",
    "from sklearn.model_selection import train_test_split\n",
    "import xgboost\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/javascript": [
       "IPython.notebook.set_autosave_interval(60000)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Autosaving every 60 seconds\n"
     ]
    }
   ],
   "source": [
    "%autosave 60"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_features=[\"IMEI\",\"sp\", \"EVVSP\", \"EVGPO\", \"EVACP\", \"EVODO\",\n",
    "             \"EVOAS\", \"EVACO_Z\", \"EVSOH\", \"EVBMA_Latest\", \"EVBMA_Max\",\n",
    "             \"EVBMA_Min\", \"EVIGM_Latest\", \"EVCOM_Latest\", \"EVCFN\", \"EVTRQ\", \n",
    "             \"EVBBV\", \"EVDR1\", \"EVDR2\", \"EVACP\", \"EVHVS\", \"EVPWA_MAX\", \n",
    "             \"EVPWA_MIN\", \"EVBMI_Latest\", \"EVSOMA\", \"EVSOMI\", \"EVIGM_Max\",\n",
    "             \"EVIRT_Max\", \"EVIRT_Min\", \"EVIDC\", \"EVMGT\", \"EVMGS\", \"EVMGF\", \"EVMGR\",\n",
    "             \"EVIND\", \"EVICM\", \"EVDI1\", \"EVDI2\", \"EVDIT\", \"EVDOA\", \"EVACS\", \"EVMSC\",\n",
    "             \"EVRER\", \"EVDRV\", \"EVMTR\", \"EVTRE\", \"EVBFN\", \"EVHTP_AVG\",\n",
    "             \"EVSMA_MAX\"]\n",
    "\n",
    "list_features=list(set(list_features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 51.8 s, sys: 4.55 s, total: 56.4 s\n",
      "Wall time: 31.2 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "path = r'/home/cwshpmu2282/Desktop/EVRangePrediction/data/raw/5_vehicle_data_model_testing'\n",
    "all_files = glob.glob(path + \"/*.csv\")\n",
    "\n",
    "li = []\n",
    "for filename in all_files:\n",
    "    df = pd.read_csv(filename, index_col=None, header=0)\n",
    "    df = df[list_features]\n",
    "    li.append(df)\n",
    "    \n",
    "data = pd.concat(li, axis=0, ignore_index=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.dropna(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 125 ms, sys: 234 ms, total: 359 ms\n",
      "Wall time: 356 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "X = data.drop(columns = [\"EVSMA_MAX\"], axis = 1)\n",
    "y = data.EVSMA_MAX.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train,x_test,y_train,y_test=train_test_split(X, y, test_size=0.33, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_train = xgboost.DMatrix(x_train, label=y_train)\n",
    "xgb_test = xgboost.DMatrix(x_test, label = y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "params= {\n",
    "    \"eta\": 0.002,\n",
    "    \"max_depth\": 3,\n",
    "    \"subsample\": 0.5\n",
    "    \n",
    "    \n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\ttest-rmse:73.8026\n",
      "[1000]\ttest-rmse:10.4831\n",
      "[2000]\ttest-rmse:2.53135\n",
      "[3000]\ttest-rmse:1.71628\n",
      "[4000]\ttest-rmse:1.50925\n",
      "[5000]\ttest-rmse:1.39683\n",
      "[6000]\ttest-rmse:1.3291\n",
      "[7000]\ttest-rmse:1.28031\n",
      "[8000]\ttest-rmse:1.24331\n",
      "[9000]\ttest-rmse:1.21425\n",
      "[9999]\ttest-rmse:1.19057\n"
     ]
    }
   ],
   "source": [
    "model_train=xgboost.train(params, xgb_train, 10000, evals = [(xgb_test, \"test\" )], verbose_eval= 1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'xgboost' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-1-b0d06dbd49fa>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mxgboost\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot_importance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mimportance_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"cover\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrcParams\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"figure.figsize\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m16\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m16\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msavefig\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'xgb_plot_imp.png'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'xgboost' is not defined"
     ]
    }
   ],
   "source": [
    "xgboost.plot_importance(model_train, importance_type = \"cover\")\n",
    "\n",
    "plt.rcParams[\"figure.figsize\"] = (16, 16)\n",
    "plt.show()\n",
    "plt.savefig('xgb_plot_imp.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickel\n",
    "pickel.dump(model_train,open('/home/cwshpmu2282/Desktop/EVRangePrediction/model/xgb.pickel.dat'),'rb')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor\n",
    "\n",
    "rnd_clf = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=42)\n",
    "rnd_clf.fit(X,y)\n",
    "\n",
    "importances = rnd_clf.feature_importances_\n",
    "\n",
    "for name, importance in zip(X, rnd_clf.feature_importances_):\n",
    "    print(name, \"=\", importance)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
